This notebook defines the most focal recurrent copy number units by removing focal changes that are within entire chromosome arm losses and gains. Most focal here meaning:
This notebook is intended to be run from the command line with the following (assumes you are in the root directory of the repository):
Rscript -e "rmarkdown::render('analyses/focal-cn-file-preparation/05-define-most-focal-cn-units.Rmd', clean = TRUE)"
# The percentage of calls a particular status needs to be
# above to be called the majority status -- the decision
# for a cutoff of 90% here was made to ensure that the status
# is not only the majority status but it is also significantly
# called more than the other status values in the region
percent_threshold <- 0.9
# The percentage threshold for determining if enough of a region
# (arm, cytoband, or gene) is callable to determine its status --
# the decision for a cutoff of 50% here was made as it seems reasonable
# to expect a region to be more than 50% callable for a dominant status
# call to be made
uncallable_threshold <- 0.5
library(tidyverse)
status_majority_caller <- function(status_df,
region_variable,
status_column_name,
percent_threshold_value,
uncallable_threshold_value = uncallable_threshold) {
# Given a data.frame with cytoband/gene-level copy number status data,
# find the dominant status of the cytoband/gene region by calculating
# the percentages of the region that each call represents.
#
# Args:
# status_df: data.frame with cytoband/gene-level copy number status data
# region_variable: string of the column name/region to calculate copy number
# status percentages for
# status_column_name: string of the column name that holds the relevant copy
# number status data
# percent_threshold_value: What percent of calls a particular status needs to be
# above to be called the majority.
# uncallable_threshold_value: a threshold for determining if enough of a region is
# callable to determine its status.
#
# Return:
# status_count: data.frame with percentage values for each unique status in
# each unique region (arm/cytoband/gene) and the dominant
# status for that region
# Tidyeval for these columns
region_sym <- rlang::sym(region_variable)
status_sym <- rlang::sym(status_column_name)
# Format the data and group it
status_count <- status_df %>%
count(!!region_sym, !!status_sym) %>%
# Spread the data -- each row represents a unique chromosome arm
spread(!!status_sym, n) %>%
# Turn NAs into 0s
replace_na(list(
gain = 0,
loss = 0,
neutral = 0,
amplification = 0,
uncallable = 0,
unstable = 0
)) %>%
# Getting counts by region
group_by(!!region_sym)
# Let's store the regions separately so as to avoid weird coercions
region_vector <-
status_count %>%
dplyr::pull(!!region_sym)
# Calculate percent
status_count <- status_count %>%
# Store region variable column out of the way as rownames
tibble::column_to_rownames(region_variable) %>%
# Obtain a total counts variable column
dplyr::mutate(total = apply(., 1, sum)) %>%
# Get the ratio of each status count to the total
dplyr::mutate_at(dplyr::vars(-total), dplyr::funs(. / total)) %>%
# Bring back our region variable as its own column
dplyr::mutate(!!region_variable := region_vector)
# The logic here is to define the region status based on the majority of calls
# in the region -- if the number of calls for a specific status is
# responsible for more than `percent_threshold` value of the total calls in that
# region, then it is used to define the region's status (exception is for the
# uncallable status where we define the region as uncallable when more than the
# `uncallable_threshold` of the calls in that region are uncallable)
if ((region_variable == "chromosome_arm") | (region_variable == "gene_symbol")) {
status_count <- status_count %>%
mutate(
dominant_status = case_when(
gain > percent_threshold ~ "gain",
loss > percent_threshold ~ "loss",
amplification > percent_threshold ~ "amplification",
TRUE ~ "neutral"
)
)
} else if (region_variable == "cytoband") {
status_count <- status_count %>%
mutate(
dominant_status = case_when(
uncallable > uncallable_threshold ~ "uncallable",
gain > percent_threshold~ "gain",
loss > percent_threshold ~ "loss",
neutral > percent_threshold ~ "neutral",
TRUE ~ "unstable"
)
)
}
return(status_count)
}
plot_dominant_status_calls <- function(status_count_df,
region_variable) {
# Given a data.frame with the percentage values for each region and the
# dominant status for that region, plot the dominant status call on the
# x-axis with the percentage values on the y-axis.
#
# Args:
# status_count_df: data.frame with percentage values for each unique status
# in each unique region (arm/cytoband/gene) and the
# dominant status for that region
# region_variable: string of the region (which will also be a column name)
# that the data.frame holds percentage values for
# Return:
# status_plot: plot representing the dominant status call on the x-axis and
# the percentage values on the y-axis
status_count_df %>%
# Remove the total column -- we don't want to plot this
select(-total) %>%
dplyr::ungroup() %>%
# Store the non-percentage value column variable as rownames
tibble::column_to_rownames(region_variable) %>%
# Gather the data.frame to have columns and values in the format of
# our status call, the percentage of total calls that status call is
# responisble for, and the dominant status call made based on the
# percentage value
tidyr::gather(status, percent,-dominant_status) %>%
# Plot our focal status values on the x-axis and the percentage values
# on the y-axis
ggplot2::ggplot(ggplot2::aes(x = status,
y = percent)) +
ggplot2::geom_jitter() +
# Facet wrap around each dominant status value
ggplot2::facet_wrap( ~ dominant_status)
}
results_dir <- "results"
# Define a logical object for running in CI
running_in_ci <- params$is_ci
Read in cytoband status file and format it for what we will need in this notebook.
# Read in the file with consensus CN status data and the UCSC cytoband data --
# generated in `03-add-cytoband-status-consensus.Rmd`
consensus_seg_cytoband_status_df <-
read_tsv(file.path("results", "consensus_seg_with_ucsc_cytoband_status.tsv.gz")) %>%
# Need this to not have `chr`
mutate(chr = gsub("chr", "", chr),
cytoband = paste0(chr, cytoband)) %>%
select(
chromosome_arm,
# Distinguish this dominant status that is based on cytobands, from the status
dominant_cytoband_status = dominant_status,
cytoband,
Kids_First_Biospecimen_ID
)
Parsed with column specification:
cols(
Kids_First_Biospecimen_ID = [31mcol_character()[39m,
chr = [31mcol_character()[39m,
cytoband = [31mcol_character()[39m,
dominant_status = [31mcol_character()[39m,
band_length = [32mcol_double()[39m,
callable_fraction = [32mcol_double()[39m,
gain_fraction = [32mcol_double()[39m,
loss_fraction = [32mcol_double()[39m,
chromosome_arm = [31mcol_character()[39m
)
Read in the chromosome-level and gene-level data.
# Read in the annotated CN file (without the UCSC data)
consensus_seg_autosomes_df <-
read_tsv(file.path(results_dir, "consensus_seg_annotated_cn_autosomes.tsv.gz"))
Parsed with column specification:
cols(
biospecimen_id = [31mcol_character()[39m,
status = [31mcol_character()[39m,
copy_number = [32mcol_double()[39m,
ploidy = [32mcol_double()[39m,
ensembl = [31mcol_character()[39m,
gene_symbol = [31mcol_character()[39m,
cytoband = [31mcol_character()[39m
)
Joining the gene-level, cytoband-level, and arm-level data into one data frame.
combined_status_df <- consensus_seg_autosomes_df %>%
inner_join(
consensus_seg_cytoband_status_df,
by = c("biospecimen_id" = "Kids_First_Biospecimen_ID", "cytoband")
)
# Use our custom function to make the status calls
arm_status_count <- status_majority_caller(
combined_status_df,
"chromosome_arm",
"status",
percent_threshold = percent_threshold
)
# Display table
arm_status_count %>%
group_by(dominant_status)
These are the chromosome arms that have not been defined as gain or loss – we want to define their cytoband/gene-level status
# Let's get a vector of the neutral arms
neutral_arms <- arm_status_count %>%
filter(dominant_status == "neutral") %>%
dplyr::pull("chromosome_arm")
We want to include cytoband and gene-level calls for chromosome arms that have not been defined as a gain or loss.
# Filter the annotated CN data to include only neutral chromosome arms
neutral_status_arm_df <- combined_status_df %>%
filter(chromosome_arm %in% neutral_arms)
Making cytoband-level majority calls.
if (!(running_in_ci)) {
# Now count the cytoband level calls (for each status call) and define
# the cytoband as that status if more than 50% of the total counts are
# for that particular status
cytoband_status_count <- status_majority_caller(
neutral_status_arm_df,
"cytoband",
"dominant_cytoband_status",
percent_threshold = percent_threshold
)
# Display table
cytoband_status_count
}
if (!(running_in_ci)) {
# These are the cytobands that have not been defined as gain or loss --
# we want to define their gene-level status
neutral_cytobands <- cytoband_status_count %>%
filter(dominant_status %in% c("unstable", "neutral")) %>%
dplyr::pull("cytoband")
# Filter the annotated CN data to include only these cytobands
neutral_status_cytoband_df <- combined_status_df %>%
filter(cytoband %in% neutral_cytobands)
# Now count the gene-level calls (for each status call) and define
# the gene as that status if more than 50% of the total counts are
# for that particular status
gene_status_count <- status_majority_caller(neutral_status_cytoband_df,
"gene_symbol",
"status",
percent_threshold = percent_threshold)
# Display table
gene_status_count
}
Plot the final dominant status call on the x-axis and the percent of each status on the y-axis.
# Run `plot_dominant_status_calls` function for the chromosome arm calls
plot_dominant_status_calls(
arm_status_count,
"chromosome_arm"
)
# Run `plot_dominant_status_calls` function for the cytoband calls if not
# running in CI
if (!(running_in_ci)) {
plot_dominant_status_calls(cytoband_status_count,
"cytoband")
}
# Plot if not running in circleCI
if (!(running_in_ci)) {
plot_dominant_status_calls(gene_status_count,
"gene_symbol")
}
# The logic variable `running_in_ci` is needed here because the CI testing
# files do not contain any of the genes in the `gene_status_count` data.frame
# generated above (when `running_in_ci` == FALSE)
if (!(running_in_ci)) {
combined_status_count_df <- consensus_seg_autosomes_df %>%
mutate(chromosome_arm = gsub("(p|q).*", "\\1", cytoband)) %>%
inner_join(arm_status_count,
by = "chromosome_arm") %>%
left_join(cytoband_status_count,
by = "cytoband",
suffix = c(".arm", ".cytoband")) %>%
left_join(gene_status_count,
by = "gene_symbol",
suffix = c(".arm", ".gene")) %>%
mutate(
focal_call = paste0(
dominant_status.arm,
", ",
dominant_status.cytoband,
", ",
dominant_status
),
# Here we want to define the most focal call based on the arm, cytoband,
# and gene status information -- if a loss/gain is defined at the arm
# level then the focal call will be "arm_loss" or "arm_gain" respectively,
# and so on.
focal_call = case_when(focal_call == "loss, NA, NA" ~ "arm_loss",
focal_call == "neutral, uncallable, NA" ~ "uncallable",
focal_call == "neutral, loss, NA" ~ "cytoband_loss",
focal_call == "neutral, NA, NA" ~ "arm_neutral",
focal_call == "neutral, unstable, neutral" ~ "gene_neutral",
focal_call == "neutral, unstable, loss" ~ "gene_loss",
TRUE ~ "Other")
)
} else {
combined_status_count_df <- consensus_seg_autosomes_df %>%
mutate(chromosome_arm = gsub("(p|q).*", "\\1", cytoband)) %>%
inner_join(arm_status_count, by = "chromosome_arm") %>%
mutate(focal_call = dominant_status)
}
# Display combined status counts table
combined_status_count_df %>%
arrange(focal_call) %>%
group_by(biospecimen_id, focal_call)
if (!(running_in_ci)) {
final_arm_status_df <- combined_status_count_df %>%
# Filter to only non-neutral chromosome arms
filter(dominant_status.arm != "neutral") %>%
select(
Kids_First_Biospecimen_ID = biospecimen_id,
region = chromosome_arm,
status = dominant_status.arm
) %>%
distinct() %>%
mutate(region_type = "chromosome_arm")
} else {
final_arm_status_df <- combined_status_count_df %>%
# Filter to only non-neutral chromosome arms
filter(dominant_status != "neutral") %>%
select(
Kids_First_Biospecimen_ID = biospecimen_id,
region = chromosome_arm,
status = dominant_status
) %>%
distinct() %>%
mutate(region_type = "chromosome_arm")
}
if (!(running_in_ci)) {
final_cytoband_status_df <- combined_status_count_df %>%
# Filter to only neutral chromosome arms and cytobands
# that are not NA
filter(dominant_status.arm == "neutral",
dominant_status.cytoband != "NA") %>%
select(
Kids_First_Biospecimen_ID = biospecimen_id,
region = cytoband,
status = dominant_status.cytoband
) %>%
distinct() %>%
mutate(region_type = "cytoband")
}
if (!(running_in_ci)) {
final_gene_status_df <- combined_status_count_df %>%
# Filter to only neutral chromosome arms and cytobands
# that are NA or unstable (do not have a clear gain or loss call)
filter(dominant_status.arm == "neutral",
dominant_status.cytoband == c("NA", "unstable")) %>%
select(
Kids_First_Biospecimen_ID = biospecimen_id,
region = gene_symbol,
status = dominant_status
) %>%
distinct() %>%
mutate(region_type = "gene_symbol")
}
if (!(running_in_ci)) {
# Bind the rows of each region's data.frame
final_long_status_df <- bind_rows(final_arm_status_df,
final_cytoband_status_df,
final_gene_status_df) %>%
filter(status != "uncallable")
} else {
final_long_status_df <- final_arm_status_df %>%
filter(status != "uncallable")
}
# Write final long status table to file
write_tsv(final_long_status_df, file.path(results_dir, "consensus_seg_most_focal_cn_status.tsv.gz"))
# Display final long status table
final_long_status_df %>%
arrange(region_type)
Note: There are gain status calls in this final data.frame. This is not completely unexpected as there were many loss calls noticed in the data to begin with, but this may be something that is worth taking note of.
final_wide_status_df <- final_long_status_df %>%
tidyr::spread(Kids_First_Biospecimen_ID, status)
# Display status data spread across samples in wide format -- each row is a
# unique region
final_wide_status_df